In [2]:
import pandas as pd
import numpy as np
import cufflinks as cf
import seaborn as sns
import matplotlib.pyplot as plt 

cf.go_offline()

Load data

In [4]:
url = r'/Users/Divir/Desktop/Geeklife/medical-malpractice-ml/NPDB1804.csv'
df = pd.read_csv(open(url, 'rb'), low_memory=False)
df.head()
Out[4]:
SEQNO RECTYPE REPTYPE ORIGYEAR WORKSTAT WORKCTRY HOMESTAT HOMECTRY LICNSTAT LICNFELD ... ACCRRPTS NPMALRPT NPLICRPT NPCLPRPT NPPSMRPT NPDEARPT NPEXCRPT NPGARPT NPCTMRPT FUNDPYMT
0 1 A 301 1991 OK NaN NaN NaN OK 10 ... 0 0 2 0 0 0 0 0 0 NaN
1 2 A 301 1991 OK NaN NaN NaN OK 10 ... 0 0 7 0 0 0 1 0 0 NaN
2 4 A 301 1991 MA NaN NaN NaN MA 15 ... 0 1 1 0 0 0 2 0 0 NaN
3 6 A 301 1990 OK NaN NaN NaN OK 10 ... 0 0 2 0 0 0 0 0 0 NaN
4 8 A 301 1990 OK NaN NaN NaN OK 10 ... 0 0 7 0 1 0 0 0 0 NaN

5 rows × 54 columns

Filter and clean data

In [5]:
# 1. Payment records only - RECTYPE = P
# 2. Non-state payments - FUNDPYMT = 0

df = df[(df['RECTYPE'] == 'P') & (df['FUNDPYMT'] == 0)]
df['PAYMENT'] = df['PAYMENT'].str.replace('$', '').astype(float)

Payments distribution by report type

In [6]:
ins_payments = df.loc[df['REPTYPE'] == 101, 'PAYMENT']
non_ins_payments = df.loc[df['REPTYPE'] == 102, 'PAYMENT']

fig, ax = plt.subplots(1, 2)

sns.distplot(ins_payments, ax=ax[0], hist_kws={'log': True}, kde=False, color='skyblue')
ax[0].set_title('Insurance Company Payments')
sns.distplot(non_ins_payments, ax=ax[1], hist_kws={'log': True}, kde=False, color='green')
ax[1].set_title('Non-Insurance Company Payments');

There doesn't seem to be much of a difference between insurance and non-insurance report payments. Both seem to have some big outliers (note that the axis is logarithmic).

In [15]:
pmt_by_year = df.groupby('ORIGYEAR')['PAYMENT'].sum()
pmt_by_year.iplot(kind='bar', title='Payments by Year', xTitle='Year', yTitle='Total payments')

Total payments don't seem to have changed much year over year either.

Payment by region

In [8]:
regions = {
    'West': ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID'],
    'South West': ['AZ', 'TX', 'NM', 'OK'],
    'South East': ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ],
    'Mid West': ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND'],
    'North East': ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME'],
}

state_to_region = {state: region for region, states in regions.items() for state in states}
df['REGION'] = df['WORKSTAT'].map(state_to_region)
In [9]:
pmt_by_region = df.groupby('REGION')[['PAYMENT']].mean().reset_index()
pmt_by_region.iplot(kind='pie', title='Payment by region', labels='REGION', values='PAYMENT')

The North East and Mid West tend to account for about half of all the payments. This could be because of stricter laws in these states, but we would have to look these us further.

In [10]:
labels = ['XS', 'SM', 'MD', 'LG', 'XL']
df['PMTGROUP'] = pd.qcut(df['PAYMENT'], q=5, labels=labels)
In [11]:
pmt_group_by_region = df.groupby('REGION')['PMTGROUP'].value_counts(normalize=True).unstack()
pmt_group_by_region = pmt_group_by_region[labels]
pmt_group_by_region.iplot(kind='bar', barmode='stack')

The North East, which is the largest piece of the pie above, tends to have very large payments compared to other regions. Again, this could be due to regulations in these states.

In [12]:
age_map = {
    -1: 'fetus',
    0: 'baby',
    70: 'elder',
    80: 'elder',
    90: 'elder'
}

df['PTAGEGROUP'] = df['PTAGE'].map(age_map)
groups = df.groupby('REGION')['PTAGEGROUP'].value_counts(normalize=True)
age_stats = groups.unstack()[['fetus', 'baby', 'elder']]
age_stats.iplot(kind='bar', title='Patient age by region')

We can see that the Mid West has about twice as many reports of fetuses as the West. We might expect payments to be higher for reports with fetuses, which we can explore next.

In [13]:
groups = df.groupby('PTAGEGROUP')['PAYMENT']
pmt_by_age_group = groups.describe().unstack()
pmt_by_age_group = pmt_by_age_group.loc[['fetus', 'baby', 'elder'], ['mean', 'std']]
pmt_by_age_group.T.iplot(kind='bar', )

Indeed payments for fetuses and babies tend to be much higher and with more variation.

Variables to predict

  • Amount of Reported Payment - PAYMENT or TOTALPMT

Possible predictors

  1. Work or license state - WORKSTAT or LICNSTAT - (category)
  2. License field - LICNFELD - (category)
  3. Practitioner's age group - PRACTAGE - (number)
  4. Malpractice Allegation Group - ALGNNATR (ALEGATN1 or ALEGATN2) - (category)
  5. Severity of Alleged Malpractice Injury - OUTCOME - (category)
  6. Number of practioners - NUMBPRSN - (number)
  7. Relationship of Paying Entity to the Practitioner - PYRRLTNS - (category)
  8. Patient Age in Groups of Year - PTAGE - (number)
  9. Gender of Patient - PTGENDER - (category)
  10. Patient Typ - PTTYPE - (category)
  11. Type of Reporting Entity - TYPE - (category)
  12. Subject's Number of Malpractice Payment Report - NPxxxRPT e.g. NPMALRPT - (number)

Variables to avoid

  • Payment a Result of Judgment or Settlement - PAYTYPE